You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition (https://www.kaggle.com/c/house-prices-advanced-regression-techniques). I want you to do the following.
# Per-column overview of the training set: five-number summaries for the
# numeric columns, length/class/mode for the character columns.
training_data %>% summary()
## id ms_sub_class ms_zoning lot_frontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## lot_area street alley lot_shape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## land_contour utilities lot_config land_slope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## neighborhood condition1 condition2 bldg_type
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## house_style overall_qual overall_cond year_built
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## year_remod_add roof_style roof_matl exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## exterior2nd mas_vnr_type mas_vnr_area exter_qual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## exter_cond foundation bsmt_qual bsmt_cond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## bsmt_exposure bsmt_fin_type1 bsmt_fin_sf1 bsmt_fin_type2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## bsmt_fin_sf2 bsmt_unf_sf total_bsmt_sf heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## heating_qc central_air electrical x1st_flr_sf
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## x2nd_flr_sf low_qual_fin_sf gr_liv_area bsmt_full_bath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## bsmt_half_bath full_bath half_bath bedroom_abv_gr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## kitchen_abv_gr kitchen_qual tot_rms_abv_grd functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## fireplaces fireplace_qu garage_type garage_yr_blt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## garage_finish garage_cars garage_area garage_qual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## garage_cond paved_drive wood_deck_sf open_porch_sf
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## enclosed_porch x3ssn_porch screen_porch pool_area
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## pool_qc fence misc_feature misc_val
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## mo_sold yr_sold sale_type sale_condition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## sale_price
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Univariate and bivariate look at the numeric predictors: for each column,
# draw its histogram and then a scatter plot of the column against sale_price.
numeric_vars <- c(
  "overall_qual", "year_built", "year_remod_add", "bsmt_fin_sf1",
  "total_bsmt_sf", "x1st_flr_sf", "x2nd_flr_sf", "gr_liv_area",
  "full_bath", "tot_rms_abv_grd", "fireplaces", "garage_cars",
  "garage_area", "wood_deck_sf", "open_porch_sf"
)
for (var_name in numeric_vars) {
  # Spell out the labels base graphics would derive from a literal
  # `training_data$<column>` argument so the plots read the same as when
  # each call is written out by hand.
  var_label <- paste0("training_data$", var_name)
  hist(
    training_data[[var_name]],
    main = paste("Histogram of", var_label),
    xlab = var_label
  )
  plot(
    training_data[[var_name]],
    training_data$sale_price,
    xlab = var_label,
    ylab = "training_data$sale_price"
  )
}
# Show the structure of every character column. `select(where(...))` is the
# current tidyselect form; the scoped `select_if()` verb is superseded.
training_data %>% select(where(is.character)) %>% str()
## tibble [1,460 × 43] (S3: tbl_df/tbl/data.frame)
## $ ms_zoning : chr [1:1460] "RL" "RL" "RL" "RL" ...
## $ street : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
## $ alley : chr [1:1460] NA NA NA NA ...
## $ lot_shape : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
## $ land_contour : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ lot_config : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
## $ land_slope : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ condition1 : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
## $ condition2 : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
## $ bldg_type : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ house_style : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
## $ roof_style : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
## $ roof_matl : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ exterior1st : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ exterior2nd : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ mas_vnr_type : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
## $ exter_qual : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
## $ exter_cond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ foundation : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ bsmt_qual : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
## $ bsmt_cond : chr [1:1460] "TA" "TA" "TA" "Gd" ...
## $ bsmt_exposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
## $ bsmt_fin_type1: chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ bsmt_fin_type2: chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
## $ heating : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
## $ heating_qc : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
## $ central_air : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ electrical : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ kitchen_qual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ functional : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
## $ fireplace_qu : chr [1:1460] NA "TA" "TA" "Gd" ...
## $ garage_type : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ garage_finish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
## $ garage_qual : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ garage_cond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ paved_drive : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ pool_qc : chr [1:1460] NA NA NA NA ...
## $ fence : chr [1:1460] NA NA NA NA ...
## $ misc_feature : chr [1:1460] NA NA NA NA ...
## $ sale_type : chr [1:1460] "WD" "WD" "WD" "WD" ...
## $ sale_condition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
# Box plot of sale_price for each level of every categorical predictor of
# interest, one plot per predictor.
categorical_vars <- c(
  "ms_zoning", "street", "lot_shape", "land_contour", "utilities",
  "lot_config", "land_slope", "neighborhood", "condition1", "condition2",
  "bldg_type", "house_style", "roof_style", "exterior1st", "exterior2nd",
  "exter_qual", "exter_cond", "heating", "heating_qc", "central_air",
  "electrical", "kitchen_qual", "functional", "garage_type",
  "garage_finish", "sale_type", "sale_condition"
)
for (var_name in categorical_vars) {
  box_plot <- ggplot(training_data) +
    geom_boxplot(aes(x = .data[[var_name]], y = sale_price)) +
    # Keep the bare column name as the x-axis title.
    labs(x = var_name)
  # ggplot objects are not auto-printed inside a loop.
  print(box_plot)
}
# Scatter-plot matrix and pairwise Pearson correlations for sale price and
# two quantitative predictors.
d1 <- training_data %>%
  select(sale_price, garage_area, x1st_flr_sf)
plot(d1, pch = 20, cex = 1.5, col = "purple")
num_items <- ncol(d1)
# cor() on a data frame returns the full pairwise correlation matrix in one
# call, so there is no need to loop over column pairs and grow a vector.
# Dimnames are dropped so corr_matrix stays a plain num_items x num_items
# matrix, the same shape the downstream matrix algebra already works with.
corr_matrix <- unname(cor(d1))
# Test H0: zero correlation between sale price and garage area (80% CI).
cor.test(x = d1$sale_price, y = d1$garage_area, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: d1$sale_price and d1$garage_area
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
# Test H0: zero correlation between sale price and first-floor area (80% CI).
cor.test(x = d1$sale_price, y = d1$x1st_flr_sf, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: d1$sale_price and d1$x1st_flr_sf
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5841687 0.6266715
## sample estimates:
## cor
## 0.6058522
# Test H0: zero correlation between garage area and first-floor area (80% CI).
cor.test(x = d1$garage_area, y = d1$x1st_flr_sf, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: d1$garage_area and d1$x1st_flr_sf
## t = 21.451, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4638446 0.5148798
## sample estimates:
## cor
## 0.4897817
# Precision matrix: the inverse of the correlation matrix.
prec_matrix <- solve(corr_matrix)
# Multiply in both orders; each product is the identity up to rounding.
cp <- corr_matrix %*% prec_matrix
pc <- prec_matrix %*% corr_matrix
# LU decomposition of the correlation matrix; L %*% U should rebuild it.
lu_decomp <- lu.decomposition(corr_matrix)
l <- lu_decomp$L
u <- lu_decomp$U
lu <- l %*% u
# Compare with a numeric tolerance. Element-wise `==` on doubles can report
# FALSE purely because of floating-point rounding in the factorisation.
isTRUE(all.equal(corr_matrix, lu))
## [1] TRUE
Many times, it makes sense to fit a closed form distribution to
data.
- Select a variable in the Kaggle.com training dataset that is skewed to
the right, shift it so that the minimum value is absolutely above zero
if necessary.
- Then load the MASS package and run fitdistr to fit an exponential
probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).
- Find the optimal value of \(\lambda\)
for this distribution, and then take 1000 samples from this exponential
distribution using this value (e.g., rexp(1000, \(\lambda\))).
- Plot a histogram and compare it with a histogram of your original
variable.
- Using the exponential pdf, find the 5th and 95th percentiles using the
cumulative distribution function (CDF).
- Also generate a 95% confidence
interval from the empirical data, assuming normality.
- Finally, provide the empirical 5th percentile and 95th percentile of
the data.
- Discuss.
The variable chosen is `lot_area`. It is skewed to the right and has a minimum value of 1300, so no shift is needed to keep it above zero.
# Histogram of lot_area, the variable used for the exponential fit.
ggplot(training_data, aes(x = lot_area)) +
  geom_histogram(fill = 'white', color = 'black', bins = 50)
# Keep lot_area on its own as a one-column tibble and summarise it.
lot_area <- select(training_data, lot_area)
summary(lot_area)
## lot_area
## Min. : 1300
## 1st Qu.: 7554
## Median : 9478
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
# Fit an exponential distribution to lot_area by maximum likelihood and keep
# the estimated rate parameter.
lot_area_dist <- MASS::fitdistr(lot_area$lot_area, densfun = 'exponential')
lambda_opt <- lot_area_dist[["estimate"]]
# Draw 1000 values from the fitted distribution (seeded for reproducibility).
set.seed(51323)
sample_dist <- rexp(n = 1000, rate = lambda_opt)
# Observed data versus simulated data.
hist(training_data$lot_area)
hist(sample_dist)
# 5th and 95th percentiles of the fitted exponential distribution.
qexp(p = .05, rate = lambda_opt)
## [1] 539.4428
qexp(p = .95, rate = lambda_opt)
## [1] 31505.6
# 95% confidence interval for the mean of lot_area using the normal
# approximation: mean +/- z * sd / sqrt(n).
x <- mean(lot_area$lot_area)
std_dev <- sd(lot_area$lot_area)
n <- length(lot_area$lot_area)
z_score <- qnorm(.975)
margin <- z_score * (std_dev / sqrt(n))
(lower_limit <- x - margin)
## [1] 10004.84
(upper_limit <- x + margin)
## [1] 11028.81
# Empirical 5th and 95th percentiles of the observed data.
quantile(lot_area$lot_area, probs = c(0.05, 0.95))
## 5% 95%
## 3311.70 17401.15
# Quartiles of sale_price, used to build the 1.5 * IQR outlier fences.
sp_quantiles <- quantile(training_data$sale_price)
sp_quantiles
## 0% 25% 50% 75% 100%
## 34900 129975 163000 214000 755000
# Index the quartiles by name rather than by position, and avoid naming the
# result `IQR`, which would mask the stats::IQR() function.
sale_price_iqr <- sp_quantiles[["75%"]] - sp_quantiles[["25%"]]
upper_limit <- sp_quantiles[["75%"]] + 1.5 * sale_price_iqr
lower_limit <- sp_quantiles[["25%"]] - 1.5 * sale_price_iqr
# Keep only the rows whose sale_price lies inside the fences.
training_adj <- training_data %>%
  filter(between(sale_price, lower_limit, upper_limit))
# Correlation of every numeric column with sale_price, rounded to 2 digits.
# Columns that contain NA yield NA here because cor() defaults to
# use = "everything".
training_corr <- training_adj %>%
  select(where(is.numeric)) %>%
  cor()
training_corr <- round(training_corr, digits = 2)
sales_corr <- training_corr[, 'sale_price']
sales_corr
## id ms_sub_class lot_frontage lot_area overall_qual
## -0.01 -0.06 NA 0.25 0.78
## overall_cond year_built year_remod_add mas_vnr_area bsmt_fin_sf1
## -0.05 0.56 0.54 NA 0.29
## bsmt_fin_sf2 bsmt_unf_sf total_bsmt_sf x1st_flr_sf x2nd_flr_sf
## 0.01 0.22 0.54 0.52 0.32
## low_qual_fin_sf gr_liv_area bsmt_full_bath bsmt_half_bath full_bath
## -0.06 0.66 0.20 -0.02 0.58
## half_bath bedroom_abv_gr kitchen_abv_gr tot_rms_abv_grd fireplaces
## 0.28 0.20 -0.15 0.47 0.45
## garage_yr_blt garage_cars garage_area wood_deck_sf open_porch_sf
## NA 0.63 0.61 0.30 0.33
## enclosed_porch x3ssn_porch screen_porch pool_area misc_val
## -0.14 0.04 0.11 0.05 -0.01
## mo_sold yr_sold sale_price
## 0.07 -0.03 1.00
# Feature engineering for the regression model, done in one pass:
#   * treat the count/rating columns as categorical,
#   * bucket year_built into before / from 1980,
#   * flag whether the house has any second-floor area.
training_adj <- training_adj %>%
  mutate(
    across(
      c(overall_qual, garage_cars, full_bath, tot_rms_abv_grd),
      as.factor
    ),
    year_built_cat = case_when(
      year_built < 1980 ~ 'pre_1980',
      TRUE ~ 'post_1980'
    ),
    has_2nd_flr = ifelse(x2nd_flr_sf == 0, 'no', 'yes')
  )
# Multiple regression of sale_price on the engineered predictors.
#
# has_2nd_flr and x2nd_flr_sf enter as main effects only. Writing them as
# has_2nd_flr * x2nd_flr_sf also adds their product, but x2nd_flr_sf is 0
# exactly when has_2nd_flr is 'no', so the product column is identical to
# x2nd_flr_sf. lm() then reports that coefficient as NA ("not defined because
# of singularities") and predict() warns about a rank-deficient fit. Dropping
# the redundant term leaves the fitted values unchanged.
m3 <- lm(
  sale_price ~ overall_qual + log(gr_liv_area) + garage_cars +
    year_built_cat + full_bath + total_bsmt_sf + x1st_flr_sf +
    has_2nd_flr + x2nd_flr_sf + tot_rms_abv_grd +
    kitchen_qual + electrical + central_air + exter_qual +
    exter_cond + heating + sale_type,
  data = training_adj
)
summary(m3)
##
## Call:
## lm(formula = sale_price ~ overall_qual + log(gr_liv_area) + garage_cars +
## year_built_cat + full_bath + total_bsmt_sf + x1st_flr_sf +
## (has_2nd_flr) * (x2nd_flr_sf) + tot_rms_abv_grd + kitchen_qual +
## electrical + central_air + exter_qual + exter_cond + heating +
## sale_type, data = training_adj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -148847 -13056 290 12539 123454
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.434e+05 7.864e+04 -3.095 0.002007 **
## overall_qual2 1.004e+04 3.089e+04 0.325 0.745186
## overall_qual3 -1.981e+04 2.908e+04 -0.681 0.495972
## overall_qual4 -7.521e+03 2.857e+04 -0.263 0.792395
## overall_qual5 -8.122e+02 2.872e+04 -0.028 0.977446
## overall_qual6 7.544e+03 2.877e+04 0.262 0.793180
## overall_qual7 2.097e+04 2.881e+04 0.728 0.466728
## overall_qual8 4.521e+04 2.895e+04 1.562 0.118635
## overall_qual9 7.421e+04 2.969e+04 2.499 0.012558 *
## overall_qual10 -3.258e+04 3.239e+04 -1.006 0.314707
## log(gr_liv_area) 7.083e+04 1.194e+04 5.933 3.79e-09 ***
## garage_cars1 9.305e+03 3.376e+03 2.756 0.005923 **
## garage_cars2 1.756e+04 3.425e+03 5.128 3.35e-07 ***
## garage_cars3 3.282e+04 4.390e+03 7.477 1.37e-13 ***
## garage_cars4 3.913e+04 1.187e+04 3.295 0.001009 **
## year_built_catpre_1980 -6.168e+03 2.413e+03 -2.556 0.010693 *
## full_bath1 -4.545e+03 9.852e+03 -0.461 0.644662
## full_bath2 -5.593e+03 9.897e+03 -0.565 0.572066
## full_bath3 2.548e+03 1.165e+04 0.219 0.826923
## total_bsmt_sf 1.528e+01 3.045e+00 5.018 5.92e-07 ***
## x1st_flr_sf 3.273e+00 8.520e+00 0.384 0.700911
## has_2nd_flryes -3.513e+04 4.285e+03 -8.199 5.63e-16 ***
## x2nd_flr_sf 3.997e+01 8.177e+00 4.888 1.14e-06 ***
## tot_rms_abv_grd3 -6.355e+04 3.801e+04 -1.672 0.094784 .
## tot_rms_abv_grd4 -6.679e+04 3.766e+04 -1.774 0.076336 .
## tot_rms_abv_grd5 -6.785e+04 3.782e+04 -1.794 0.073065 .
## tot_rms_abv_grd6 -7.054e+04 3.798e+04 -1.858 0.063442 .
## tot_rms_abv_grd7 -7.113e+04 3.806e+04 -1.869 0.061884 .
## tot_rms_abv_grd8 -7.442e+04 3.812e+04 -1.952 0.051134 .
## tot_rms_abv_grd9 -6.817e+04 3.822e+04 -1.784 0.074692 .
## tot_rms_abv_grd10 -7.737e+04 3.845e+04 -2.012 0.044384 *
## tot_rms_abv_grd11 -1.031e+05 3.898e+04 -2.646 0.008250 **
## tot_rms_abv_grd12 -1.264e+05 3.931e+04 -3.216 0.001333 **
## tot_rms_abv_grd14 -8.526e+04 4.724e+04 -1.805 0.071326 .
## kitchen_qualFa -2.712e+04 6.390e+03 -4.244 2.35e-05 ***
## kitchen_qualGd -1.677e+04 3.985e+03 -4.208 2.75e-05 ***
## kitchen_qualTA -2.612e+04 4.228e+03 -6.178 8.61e-10 ***
## electricalFuseF -1.119e+03 5.804e+03 -0.193 0.847124
## electricalFuseP -6.052e+03 1.528e+04 -0.396 0.692074
## electricalMix -4.419e+04 2.562e+04 -1.725 0.084765 .
## electricalSBrkr 6.404e+03 2.897e+03 2.211 0.027234 *
## central_airY 1.721e+04 3.526e+03 4.883 1.17e-06 ***
## exter_qualFa -3.331e+04 1.083e+04 -3.075 0.002150 **
## exter_qualGd -1.188e+04 6.669e+03 -1.782 0.074982 .
## exter_qualTA -1.874e+04 6.930e+03 -2.704 0.006932 **
## exter_condFa -4.960e+04 1.618e+04 -3.065 0.002221 **
## exter_condGd -3.739e+04 1.541e+04 -2.427 0.015368 *
## exter_condPo -7.224e+04 3.035e+04 -2.381 0.017420 *
## exter_condTA -4.076e+04 1.528e+04 -2.668 0.007721 **
## heatingGasA -2.290e+04 2.603e+04 -0.879 0.379307
## heatingGasW -3.121e+04 2.663e+04 -1.172 0.241532
## heatingGrav -1.441e+04 2.768e+04 -0.520 0.602827
## heatingOthW -8.464e+04 3.157e+04 -2.681 0.007427 **
## heatingWall -1.114e+04 2.850e+04 -0.391 0.695874
## sale_typeCon 6.544e+04 1.813e+04 3.610 0.000318 ***
## sale_typeConLD 9.236e+03 9.600e+03 0.962 0.336171
## sale_typeConLI -7.055e+03 1.314e+04 -0.537 0.591424
## sale_typeConLw 1.263e+04 1.240e+04 1.018 0.308646
## sale_typeCWD 2.814e+04 1.337e+04 2.105 0.035485 *
## sale_typeNew 1.595e+04 4.912e+03 3.247 0.001197 **
## sale_typeOth 1.533e+04 1.514e+04 1.012 0.311608
## sale_typeWD 1.479e+04 3.931e+03 3.763 0.000175 ***
## has_2nd_flryes:x2nd_flr_sf NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24850 on 1336 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8317, Adjusted R-squared: 0.8241
## F-statistic: 108.3 on 61 and 1336 DF, p-value: < 2.2e-16
# Residual diagnostics for m3.
# Use a 1 x 1 plotting layout so each diagnostic plot gets its own page.
par(mfrow=c(1,1))
# Diagnostic plots produced by the plot method for the fitted lm object.
plot(m3)
## Warning: not plotting observations with leverage one:
## 241, 359, 380, 508, 607, 1265
# Histogram of the model residuals.
hist(resid(m3))
# Score the Kaggle hold-out set with m3 and write the submission file.
#
# The hold-out file has to be read here. Reading train.csv instead produces
# predictions for the training houses, which can only be made to look like a
# submission by dropping rows and inventing Ids.
# NOTE(review): assumes test.csv sits in the same folder as train.csv - confirm.
test_data <- read_delim('/Users/korymartin/Library/Mobile Documents/com~apple~CloudDocs/Grad Programs/CUNY SPS/DATA 605/Final Exam/test.csv')
test_data <- clean_names(test_data)

# Apply the same feature engineering that was applied to training_adj.
test_data <- test_data %>%
  mutate(
    across(
      c(overall_qual, garage_cars, full_bath, tot_rms_abv_grd),
      as.factor
    ),
    year_built_cat = case_when(
      year_built < 1980 ~ 'pre_1980',
      TRUE ~ 'post_1980'
    ),
    has_2nd_flr = ifelse(x2nd_flr_sf == 0, 'no', 'yes')
  )

# predict() stops with an error when a categorical predictor holds a level
# the model was not fitted on. Re-code every categorical predictor against
# the levels stored in m3 so that unseen values become NA and the affected
# rows simply receive an NA prediction.
for (col_name in names(m3$xlevels)) {
  test_data[[col_name]] <- factor(
    as.character(test_data[[col_name]]),
    levels = m3$xlevels[[col_name]]
  )
}

predict.y <- predict(m3, test_data)

# Every hold-out Id needs a price, so rows are never dropped and Ids are
# taken from the data rather than generated. Rows that could not be scored
# (missing or unseen predictor values) fall back to the median training price.
# NOTE(review): the fallback is a placeholder choice - confirm or replace
# with a proper imputation of the predictors.
fallback_price <- median(training_adj$sale_price)
test_data2 <- tibble(
  Id = test_data$id,
  SalePrice = coalesce(as.numeric(predict.y), fallback_price)
)
write_csv(test_data2, './test_data2.csv')